library(xgboost) # for xgboost
library(tidyverse) # general utility functions (readr, dplyr, stringr, ...)
# NOTE(review): hard-coded absolute Windows path — consider a relative path so the script is portable
diseaseInfo <- read_csv("C:/Users/User/Desktop/university/machine learning/machine learning - salini/dataset/Outbreak_240817.csv")
Parsed with column specification:
cols(
.default = col_character(),
Id = col_double(),
latitude = col_double(),
longitude = col_double(),
sumAtRisk = col_double(),
sumCases = col_double(),
sumDeaths = col_double(),
sumDestroyed = col_double(),
sumSlaughtered = col_double(),
humansAge = col_double(),
humansAffected = col_double(),
humansDeaths = col_double()
)
See spec(...) for full column specifications.
The core xgboost function requires data to be a matrix. A matrix is like a dataframe that only has numbers in it. A sparse matrix is a matrix that has a lot of zeros in it. XGBoost has a built-in datatype, DMatrix, that is particularly good at storing and accessing sparse matrices efficiently.
head(diseaseInfo) # peek at the raw data before cleaning
Our data will need some cleaning before it’s ready to be put in a matrix. To prepare it, there are a number of steps we need to complete:
# drop every column about humans affected by the disease: those columns leak
# the label we are about to build
diseaseInfo_humansRemoved <- select(diseaseInfo, -starts_with("human"))
Let’s create the labels (note: the result is a one-column logical matrix, not a plain vector)
# Label: TRUE when humansAffected is non-NA (i.e. the outbreak affected humans).
# `!is.na()` on a one-column data frame yields a one-column logical matrix,
# the same shape the original is.na() %>% magrittr::not() pipeline produced,
# but without the roundabout double-negation helpers.
diseaseLabels <- !is.na(select(diseaseInfo, humansAffected))
# check out the first few lines
head(diseaseLabels) # of our target variable
humansAffected
[1,] FALSE
[2,] FALSE
[3,] FALSE
[4,] FALSE
[5,] FALSE
[6,] FALSE
head(diseaseInfo$humansAffected) # of the original column (mostly NA -> label FALSE)
[1] NA NA NA NA NA NA
# Keep only numeric predictors; drop identifiers and raw coordinates.
diseaseInfo_numeric <- diseaseInfo_humansRemoved %>%
  select(-Id) %>% # the case id shouldn't contain useful information
  select(-c(longitude, latitude)) %>% # location data is also in country data
  select(where(is.numeric)) # select remaining numeric columns (select_if is superseded)
# make sure that our dataframe is all numeric
str(diseaseInfo_numeric)
tibble [17,008 x 5] (S3: tbl_df/tbl/data.frame)
$ sumAtRisk : num [1:17008] 248000 122 1283 NA NA ...
$ sumCases : num [1:17008] 12 6 112 1 1 1 19 2 1600 5 ...
$ sumDeaths : num [1:17008] 12 1 0 1 1 1 19 2 0 5 ...
$ sumDestroyed : num [1:17008] 50000 0 NA 0 NA NA 0 0 4000 0 ...
$ sumSlaughtered: num [1:17008] 0 0 7 0 NA NA 0 0 0 0 ...
head(diseaseInfo$country) # categorical column we will one-hot encode next
[1] "South Africa" "Russian Federation" "Zimbabwe" "South Africa"
[5] "Czech Republic" "Czech Republic"
# demo: ~col-1 builds dummy (0/1) columns with no intercept column
model.matrix(~country-1,head(diseaseInfo)) # one-hot matrix for just the first few rows of the "country" column
countryCzech Republic countryRussian Federation countrySouth Africa countryZimbabwe
1 0 0 1 0
2 0 1 0 0
3 0 0 0 1
4 0 0 1 0
5 1 0 0 0
6 1 0 0 0
attr(,"assign")
[1] 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$country
[1] "contr.treatment"
# one-hot encode country for ALL rows.
# Set na.action = "na.pass" BEFORE this call: by default model.matrix drops
# rows with NA, which would make `region` shorter than diseaseInfo_numeric
# and break the cbind() below with a row-count mismatch.
options(na.action = "na.pass")
region <- model.matrix(~country-1, diseaseInfo)
# check out the first few lines of the species
head(diseaseInfo$speciesDescription) # free text like "domestic, swine"
[1] "domestic, unspecified bird" "domestic, swine" "domestic, cattle"
[4] "wild, unspecified bird" "wild, wild boar" "wild, wild boar"
# flag domestic animals; note str_detect returns NA (not FALSE) for NA descriptions
diseaseInfo_numeric$is_domestic <- str_detect(diseaseInfo$speciesDescription, "domestic")
# grab the last word of each row and use that to create a one-hot matrix of different species
# get a list of all the species by getting the last word of each description
speciesList <- diseaseInfo$speciesDescription %>%
  # remove ALL punctuation — str_replace (singular) only removed the FIRST
  # match, so rows with paired parentheses kept a trailing ")" and the
  # [a-z]*$ extraction below matched an empty string instead of the species
  str_replace_all("[[:punct:]]", "") %>%
  str_extract("[a-z]*$") # extract the last word in each row
# convert our list into a dataframe...
speciesList <- tibble(species = speciesList)
# and convert to a matrix using 1 hot encoding
options(na.action = "na.pass") # don't drop NA values!
species <- model.matrix(~species-1, speciesList)
# add our one-hot encoded variables and convert the dataframe into a matrix
diseaseInfo_numeric <- cbind(diseaseInfo_numeric, region, species)
diseaseInfo_matrix <- data.matrix(diseaseInfo_numeric)
# Shuffle BEFORE splitting so train/test are random samples of the data.
# (The original shuffled `diseaseInfo` only AFTER the split, which had no
# effect on the already-built matrices.) Features and labels are permuted
# with the SAME index so they stay aligned row-for-row.
set.seed(1234)
shuffle_index <- sample(seq_len(nrow(diseaseInfo_matrix)))
diseaseInfo_matrix <- diseaseInfo_matrix[shuffle_index, ]
diseaseLabels <- diseaseLabels[shuffle_index]
# 70/30 training/test split
numberOfTrainingSamples <- round(length(diseaseLabels) * .7)
# training data
train_data <- diseaseInfo_matrix[1:numberOfTrainingSamples, ]
train_labels <- diseaseLabels[1:numberOfTrainingSamples]
# testing data
test_data <- diseaseInfo_matrix[-(1:numberOfTrainingSamples), ]
test_labels <- diseaseLabels[-(1:numberOfTrainingSamples)]
# put our testing & training data into two separate DMatrix objects
dtrain <- xgb.DMatrix(data = train_data, label = train_labels)
dtest <- xgb.DMatrix(data = test_data, label = test_labels)
# train a baseline xgboost model
model <- xgboost(data = dtrain,
                 nrounds = 2, # max number of boosting iterations ('nrounds' spelled out — 'nround' only worked via partial argument matching)
                 objective = "binary:logistic") # objective function
[1] train-error:0.019654
[2] train-error:0.019654
# generate predictions for our held-out testing data
pred <- predict(model, dtest)
# get & print the classification error
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
[1] "test-error= 0.000980007840062721"
# train an xgboost model with shallower trees
model_tuned <- xgboost(data = dtrain,
                       max.depth = 3, # maximum depth of each decision tree
                       nrounds = 2, # max number of boosting iterations ('nrounds' spelled out — avoid partial argument matching)
                       objective = "binary:logistic") # objective function
[1] train-error:0.019654
[2] train-error:0.019654
# generate predictions for our held-out testing data
pred <- predict(model_tuned, dtest)
# get & print the classification error (probabilities thresholded at 0.5)
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
[1] "test-error= 0.000980007840062721"
There are two things we can try to see if we improve our model performance: - Account for the fact that we have imbalanced classes - Train for more rounds
# get the number of negative & positive cases in our data
negative_cases <- sum(train_labels == FALSE)
postive_cases <- sum(train_labels == TRUE) # NB: 'postive' typo kept — later code references this name
# train a model using our training data
model_tuned <- xgboost(data = dtrain,
                       max.depth = 3, # maximum depth of each decision tree
                       nrounds = 10, # number of boosting rounds ('nrounds' spelled out — avoid partial argument matching)
                       early_stopping_rounds = 3, # if we don't see an improvement in this many rounds, stop
                       objective = "binary:logistic", # objective function
                       scale_pos_weight = negative_cases/postive_cases) # control for imbalanced classes
[1] train-error:0.020410
Will train until train_error hasn't improved in 3 rounds.
[2] train-error:0.020494
[3] train-error:0.020494
[4] train-error:0.020914
Stopping. Best iteration:
[1] train-error:0.020410
# score the held-out test set with the re-weighted model
pred <- predict(model_tuned, dtest)
# fraction of thresholded predictions that disagree with the true labels
err <- mean((pred > 0.5) != test_labels)
print(paste("test-error=", err))
[1] "test-error= 0.00392003136025088"
… TODO
# train a model using our training data, now with regularization
model_tuned <- xgboost(data = dtrain,
                       max.depth = 3, # maximum depth of each decision tree
                       nrounds = 10, # number of boosting rounds ('nrounds' spelled out — avoid partial argument matching)
                       early_stopping_rounds = 3, # if we don't see an improvement in this many rounds, stop
                       objective = "binary:logistic", # objective function
                       scale_pos_weight = negative_cases/postive_cases, # control for imbalanced classes
                       gamma = 1) # minimum loss reduction required to make a split (regularization)
[1] train-error:0.020410
Will train until train_error hasn't improved in 3 rounds.
[2] train-error:0.020494
[3] train-error:0.020494
[4] train-error:0.020914
Stopping. Best iteration:
[1] train-error:0.020410
# generate predictions for our held-out testing data
pred <- predict(model_tuned, dtest)
# get & print the classification error (same 0.5 threshold as before)
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
[1] "test-error= 0.00392003136025088"
# plot them features! what's contributing most to our model?
# NOTE: a matrix stores its column labels in colnames(), not names();
# names() on a matrix returns NULL, so the original call silently passed
# no feature names at all.
xgb.plot.multi.trees(feature_names = colnames(diseaseInfo_matrix),
                     model = model)
Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Because we’re using a logistic model here, it’s telling us the log-odds rather than the probability
# convert log odds to probability (the inverse-logit / sigmoid function)
odds_to_probs <- function(odds) {
  # 1 / (1 + exp(-odds)) is the numerically stable form: the naive
  # exp(odds) / (1 + exp(odds)) overflows to Inf/Inf = NaN for large
  # positive odds, while this form correctly saturates at 1 (and at 0
  # for large negative odds)
  1 / (1 + exp(-odds))
}
# probability of the leaf above countryPortugal
odds_to_probs(-0.599)
[1] 0.3545725
# get information on how important each feature is
# (colnames(), not names() — names() on a matrix is NULL, so the original
# call passed no feature names to xgb.importance)
importance_matrix <- xgb.importance(colnames(diseaseInfo_matrix), model = model)
# and plot it!
xgb.plot.importance(importance_matrix)
# NOTE(review): exploratory PCA left commented out; prcomp(scale = TRUE)
# would fail on the NAs present in these features — impute or drop first
# diseaseInfo_numeric.pca <- prcomp(diseaseInfo_numeric[, c(1:7,10,11)],
# center = TRUE,
# scale = TRUE)
diseaseInfo_numeric # print the final feature table